## Data Cleaning
# Load the raw data file.
data <- read.csv("../data/filledDatabase111119NUMONLY.csv")
# Clean it with the project helper clean_data().
data <- clean_data(data)
# Set the compound label, the group category and the cluster label X aside,
# so that `data` keeps only the predictor columns.
compound <- data$Compound
group_cat <- data$GroupCat
X <- data$X
# Text form of the group category ("Grp <category>"), used later as legend key.
group_cat_text <- paste("Grp", group_cat)
data <- data %>%
  select(-c("Compound", "GroupCat", "X"))
# Frequency table of the cluster label X: one count per label, sorted from the
# most to the least frequent, then transposed so the labels run across the page.
summary(X) %>%
  as.data.frame() %>%
  set_colnames("Frequency") %>%
  rownames_to_column(var = "X") %>%
  arrange(desc(Frequency)) %>%
  t() %>%
  kable(caption = "Frequency table for cluster X of data version 11/11/19") %>%
  kable_styling(bootstrap_options = "striped")
| X | O | F | Cl | Br | I | S | Se | l | Te | N |
|---|---|---|---|---|---|---|---|---|---|---|
| Frequency | 259 | 71 | 50 | 25 | 21 | 7 | 4 | 3 | 1 | 0 |
# data slicing: keep only the rows whose cluster label X is "O"
rows_to_take <- X == "O"
subset <- data[rows_to_take, ]
compound_sub <- compound[rows_to_take]
group_cat_sub <- group_cat[rows_to_take]
group_cat_text_sub <- group_cat_text[rows_to_take]
# variables in PC space
fviz_pca_var(
  # PCA on the standardised predictors. The argument is spelled `scale.` in
  # full so the call does not rely on partial argument matching.
  prcomp(subset, scale. = TRUE),
  col.var = "contrib", # Color by contributions to the PC
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE # Avoid text overlapping
)
Figure 1: Cluster O: vectors of predictors in the space of PC1 and PC2
# Colour palette: seven base colours repeated ten times, so the manual colour
# scale has more values than there are groups to draw.
set_color <- rep(
  c("#0071C3", "#DE501A", "#EEB020", "#7E2E8E", "#79AC2C", "#4DBDF7", "#A51331"),
  times = 10
)
# Project the subset with the project helper get_pc_space() (k = 3), then
# centre and scale each returned column.
# NOTE(review): presumably the columns are the first three PC scores named
# PC1..PC3 (the plot below needs PC1 and PC2) -- confirm against the helper.
data_pca <- get_pc_space(subset, k = 3) %>% scale()
# Plot the compounds on PC1 vs PC2, coloured by group category. Colour is
# mapped once in ggplot() and inherited by both layers; it is the only mapped
# aesthetic that needs a manual scale.
data.frame(Compound = compound_sub, GroupCat = group_cat_text_sub, data_pca) %>%
  ggplot(aes(x = PC1, y = PC2, color = GroupCat)) +
  geom_point(size = 2, alpha = 0.4) +
  geom_text(aes(label = Compound), size = 3) +
  scale_color_manual(values = set_color) +
  theme_minimal()
Figure 2: Cluster O: compounds in the space of the first two PC’s
# Give every row a unique, syntactically valid name derived from its compound.
rownames(subset) <- make.names(compound_sub, unique = TRUE)
# PCA on the standardised predictors (`scale.` spelled out in full).
fit <- prcomp(subset, scale. = TRUE)
# Biplot of compounds (points filled by group category) and predictor vectors
# (coloured by contribution).
fviz_pca_biplot(
  fit,
  axes = c(1, 2), # PC1 on the x axis, PC2 on the y axis
  # individual
  label = "var", labelsize = 4,
  geom = c("point", "text"), fill.ind = group_cat_text_sub, alpha.ind = 0.7,
  pointsize = 2, pointshape = 21, palette = set_color[1:11],
  # variable
  col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
) +
  labs(fill = "Group Cat", color = "Contrib")
Figure 3: Cluster O: compounds and predictor vectors in the space of the first two PC’s
# data slicing: keep only the rows whose cluster label X is "F"
rows_to_take <- X == "F"
subset <- data[rows_to_take, ]
compound_sub <- compound[rows_to_take]
group_cat_sub <- group_cat[rows_to_take]
group_cat_text_sub <- group_cat_text[rows_to_take]
# variables in PC space
fviz_pca_var(
  # PCA on the standardised predictors. The argument is spelled `scale.` in
  # full so the call does not rely on partial argument matching.
  prcomp(subset, scale. = TRUE),
  col.var = "contrib", # Color by contributions to the PC
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE # Avoid text overlapping
)
Figure 4: Cluster F: vectors of predictors in the space of PC1 and PC2
# Project the subset with the project helper get_pc_space() (k = 3), then
# centre and scale each returned column.
# NOTE(review): presumably the columns are the first three PC scores named
# PC1..PC3 (the plot below needs PC1 and PC2) -- confirm against the helper.
data_pca <- get_pc_space(subset, k = 3) %>% scale()
# Plot the compounds on PC1 vs PC2, coloured by group category. Colour is
# mapped once in ggplot() and inherited by both layers; it is the only mapped
# aesthetic that needs a manual scale.
data.frame(Compound = compound_sub, GroupCat = group_cat_text_sub, data_pca) %>%
  ggplot(aes(x = PC1, y = PC2, color = GroupCat)) +
  geom_point(size = 2, alpha = 0.4) +
  geom_text(aes(label = Compound), size = 3) +
  scale_color_manual(values = set_color) +
  theme_minimal()
Figure 5: Cluster F: compounds in the space of the first two PC’s
# Give every row a unique, syntactically valid name derived from its compound.
rownames(subset) <- make.names(compound_sub, unique = TRUE)
# PCA on the standardised predictors (`scale.` spelled out in full).
fit <- prcomp(subset, scale. = TRUE)
# Biplot of compounds (points filled by group category) and predictor vectors
# (coloured by contribution).
fviz_pca_biplot(
  fit,
  axes = c(1, 2), # PC1 on the x axis, PC2 on the y axis
  # individual
  label = "var", labelsize = 4,
  geom = c("point", "text"), fill.ind = group_cat_text_sub, alpha.ind = 0.7,
  pointsize = 2, pointshape = 21, palette = set_color[1:11],
  # variable
  col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
) +
  labs(fill = "Group Cat", color = "Contrib")
Figure 6: Cluster F: compounds and predictor vectors in the space of the first two PC’s
# data slicing: keep only the rows whose cluster label X is "Cl"
rows_to_take <- X == "Cl"
subset <- data[rows_to_take, ]
compound_sub <- compound[rows_to_take]
group_cat_sub <- group_cat[rows_to_take]
group_cat_text_sub <- group_cat_text[rows_to_take]
# variables in PC space
fviz_pca_var(
  # PCA on the standardised predictors. The argument is spelled `scale.` in
  # full so the call does not rely on partial argument matching.
  prcomp(subset, scale. = TRUE),
  col.var = "contrib", # Color by contributions to the PC
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE # Avoid text overlapping
)
Figure 7: Cluster Cl: vectors of predictors in the space of PC1 and PC2
# Project the subset with the project helper get_pc_space() (k = 3), then
# centre and scale each returned column.
# NOTE(review): presumably the columns are the first three PC scores named
# PC1..PC3 (the plot below needs PC1 and PC2) -- confirm against the helper.
data_pca <- get_pc_space(subset, k = 3) %>% scale()
# Plot the compounds on PC1 vs PC2, coloured by group category. Colour is
# mapped once in ggplot() and inherited by both layers; it is the only mapped
# aesthetic that needs a manual scale.
data.frame(Compound = compound_sub, GroupCat = group_cat_text_sub, data_pca) %>%
  ggplot(aes(x = PC1, y = PC2, color = GroupCat)) +
  geom_point(size = 2, alpha = 0.4) +
  geom_text(aes(label = Compound), size = 3) +
  scale_color_manual(values = set_color) +
  theme_minimal()
Figure 8: Cluster Cl: compounds in the space of the first two PC’s
# Give every row a unique, syntactically valid name derived from its compound.
rownames(subset) <- make.names(compound_sub, unique = TRUE)
# PCA on the standardised predictors (`scale.` spelled out in full).
fit <- prcomp(subset, scale. = TRUE)
# Biplot of compounds (points filled by group category) and predictor vectors
# (coloured by contribution).
fviz_pca_biplot(
  fit,
  axes = c(1, 2), # PC1 on the x axis, PC2 on the y axis
  # individual
  label = "var", labelsize = 4,
  geom = c("point", "text"), fill.ind = group_cat_text_sub, alpha.ind = 0.7,
  pointsize = 2, pointshape = 21, palette = set_color[1:11],
  # variable
  col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
) +
  labs(fill = "Group Cat", color = "Contrib")
Figure 9: Cluster Cl: compounds and predictor vectors in the space of the first two PC’s
# data slicing: keep only the rows whose cluster label X is "Br"
rows_to_take <- X == "Br"
subset <- data[rows_to_take, ]
compound_sub <- compound[rows_to_take]
group_cat_sub <- group_cat[rows_to_take]
group_cat_text_sub <- group_cat_text[rows_to_take]
# variables in PC space
fviz_pca_var(
  # PCA on the standardised predictors. The argument is spelled `scale.` in
  # full so the call does not rely on partial argument matching.
  prcomp(subset, scale. = TRUE),
  col.var = "contrib", # Color by contributions to the PC
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE # Avoid text overlapping
)
Figure 10: Cluster Br: vectors of predictors in the space of PC1 and PC2
# Project the subset with the project helper get_pc_space() (k = 3), then
# centre and scale each returned column.
# NOTE(review): presumably the columns are the first three PC scores named
# PC1..PC3 (the plot below needs PC1 and PC2) -- confirm against the helper.
data_pca <- get_pc_space(subset, k = 3) %>% scale()
# Plot the compounds on PC1 vs PC2, coloured by group category. Colour is
# mapped once in ggplot() and inherited by both layers; it is the only mapped
# aesthetic that needs a manual scale.
data.frame(Compound = compound_sub, GroupCat = group_cat_text_sub, data_pca) %>%
  ggplot(aes(x = PC1, y = PC2, color = GroupCat)) +
  geom_point(size = 2, alpha = 0.4) +
  geom_text(aes(label = Compound), size = 3) +
  scale_color_manual(values = set_color) +
  theme_minimal()
Figure 11: Cluster Br: compounds in the space of the first two PC’s
# Give every row a unique, syntactically valid name derived from its compound.
rownames(subset) <- make.names(compound_sub, unique = TRUE)
# PCA on the standardised predictors (`scale.` spelled out in full).
fit <- prcomp(subset, scale. = TRUE)
# Biplot of compounds (points filled by group category) and predictor vectors
# (coloured by contribution).
fviz_pca_biplot(
  fit,
  axes = c(1, 2), # PC1 on the x axis, PC2 on the y axis
  # individual
  label = "var", labelsize = 4,
  geom = c("point", "text"), fill.ind = group_cat_text_sub, alpha.ind = 0.7,
  pointsize = 2, pointshape = 21, palette = set_color[1:11],
  # variable
  col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
) +
  labs(fill = "Group Cat", color = "Contrib")
Figure 12: Cluster Br: compounds and predictor vectors in the space of the first two PC’s
# data slicing: keep only the rows whose cluster label X is "I"
rows_to_take <- X == "I"
subset <- data[rows_to_take, ]
compound_sub <- compound[rows_to_take]
group_cat_sub <- group_cat[rows_to_take]
group_cat_text_sub <- group_cat_text[rows_to_take]
# variables in PC space
fviz_pca_var(
  # PCA on the standardised predictors. The argument is spelled `scale.` in
  # full so the call does not rely on partial argument matching.
  prcomp(subset, scale. = TRUE),
  col.var = "contrib", # Color by contributions to the PC
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE # Avoid text overlapping
)
Figure 13: Cluster I: vectors of predictors in the space of PC1 and PC2
# Project the subset with the project helper get_pc_space() (k = 3), then
# centre and scale each returned column.
# NOTE(review): presumably the columns are the first three PC scores named
# PC1..PC3 (the plot below needs PC1 and PC2) -- confirm against the helper.
data_pca <- get_pc_space(subset, k = 3) %>% scale()
# Plot the compounds on PC1 vs PC2, coloured by group category. Colour is
# mapped once in ggplot() and inherited by both layers; it is the only mapped
# aesthetic that needs a manual scale.
data.frame(Compound = compound_sub, GroupCat = group_cat_text_sub, data_pca) %>%
  ggplot(aes(x = PC1, y = PC2, color = GroupCat)) +
  geom_point(size = 2, alpha = 0.4) +
  geom_text(aes(label = Compound), size = 3) +
  scale_color_manual(values = set_color) +
  theme_minimal()
Figure 14: Cluster I: compounds in the space of the first two PC’s
# Give every row a unique, syntactically valid name derived from its compound.
rownames(subset) <- make.names(compound_sub, unique = TRUE)
# PCA on the standardised predictors (`scale.` spelled out in full).
fit <- prcomp(subset, scale. = TRUE)
# Biplot of compounds (points filled by group category) and predictor vectors
# (coloured by contribution).
fviz_pca_biplot(
  fit,
  axes = c(1, 2), # PC1 on the x axis, PC2 on the y axis
  # individual
  label = "var", labelsize = 4,
  geom = c("point", "text"), fill.ind = group_cat_text_sub, alpha.ind = 0.7,
  pointsize = 2, pointshape = 21, palette = set_color[1:11],
  # variable
  col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
) +
  labs(fill = "Group Cat", color = "Contrib")
Figure 15: Cluster I: compounds and predictor vectors in the space of the first two PC’s